Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
library(forcats)
library(stringr)

Load the Data

The csv can be found at https://catalog.data.gov/dataset/traffic-violations-56dda

# Parsing the raw csv takes several minutes, so the parsed table is cached as
# an rds file next to it; when the cache already exists it is loaded instead,
# which takes far less time.
rds_path <- "MOCO_traffic.rds"
csv_path <- "Traffic_Violations.csv"

if (file.exists(rds_path)) {
  traffic <- readRDS(rds_path)
} else {
  traffic <- readr::read_csv(csv_path)
  # Write the cache so that later runs can skip the slow csv parse.
  saveRDS(traffic, rds_path)
}
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Time Of Stop` = col_time(format = ""),
##   Latitude = col_double(),
##   Longitude = col_double(),
##   Year = col_double()
## )
## See spec(...) for full column specifications.

Format the Data

# Convert the column names to lower snake_case: spaces become underscores and
# everything is lower-cased (e.g. "Time Of Stop" becomes "time_of_stop").
names(traffic) %<>%
  str_replace_all(" ", "_") %>%
  tolower()

# Parse the stop date (month/day/year text) and stop time, and turn the
# "Yes"/"No" flag columns into logical TRUE/FALSE.
traffic %<>% mutate(
  date_of_stop = lubridate::mdy(date_of_stop),
  time_of_stop = lubridate::hms(time_of_stop),
  accident = accident == "Yes",
  belts = belts == "Yes",
  personal_injury = personal_injury == "Yes",
  property_damage = property_damage == "Yes",
  fatal = fatal == "Yes",
  commercial_license = commercial_license == "Yes",
  hazmat = hazmat == "Yes",
  commercial_vehicle = commercial_vehicle == "Yes",
  alcohol = alcohol == "Yes",
  work_zone = work_zone == "Yes",
  contributed_to_accident = contributed_to_accident == "Yes"
)

# Drop the geolocation column, which won't be used.
traffic %<>% select(-geolocation)

# Add columns for the year, month, and day of the violation; month_year is the
# first day of the month of the stop and is used for monthly aggregation.
traffic %<>% mutate(
  year_of_stop = lubridate::year(date_of_stop),
  month_of_stop = lubridate::month(date_of_stop),
  month_year = lubridate::make_date(year = year_of_stop, month = month_of_stop),
  day_of_stop = lubridate::day(date_of_stop)
)

# Keep only complete years. The cutoff is derived from the data rather than
# from the run date alone: the data is read from a cached file, so comparing
# against Sys.Date() only would let a partial final year back in whenever the
# script is rerun in a later calendar year. The year of the latest recorded
# stop counts as complete only if that stop falls on December 31. The cutoff
# is also capped at last calendar year, so the year in which the script is
# run is always excluded.
last_stop <- max(traffic$date_of_stop, na.rm = TRUE)
last_complete_year <- min(
  lubridate::year(last_stop) -
    as.integer(format(last_stop, "%m-%d") != "12-31"),
  lubridate::year(Sys.Date()) - 1
)
traffic %<>% filter(year_of_stop <= last_complete_year)

Take a Look at the Data

# Render the first few rows of the cleaned data as a table.
traffic %>%
  head() %>%
  knitr::kable()
date_of_stop time_of_stop agency subagency description location latitude longitude accident belts personal_injury property_damage fatal commercial_license hazmat commercial_vehicle alcohol work_zone state vehicletype year make model color violation_type charge article contributed_to_accident race gender driver_city driver_state dl_state arrest_type year_of_stop month_of_stop month_year day_of_stop
2013-09-24 17H 11M 0S MCP 3rd district, Silver Spring DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGISTRATION 8804 FLOWER AVE NA NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE MD 02 - Automobile 2008 FORD 4S BLACK Citation 13-401(h) Transportation Article FALSE BLACK M TAKOMA PARK MD MD A - Marked Patrol 2013 9 2013-09-01 24
2017-08-29 10H 19M 0S MCP 2nd district, Bethesda DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS WISCONSIN AVE@ ELM ST 38.98172 -77.09276 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE VA 02 - Automobile 2001 TOYOTA COROLLA GREEN Citation 21-201(a1) Transportation Article FALSE WHITE F FAIRFAX STATION VA VA A - Marked Patrol 2017 8 2017-08-01 29
2014-12-01 12H 52M 0S MCP 6th district, Gaithersburg / Montgomery Village FAILURE STOP AND YIELD AT THRU HWY CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE 39.16289 -77.22909 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE MD 02 - Automobile 2001 HONDA ACCORD SILVER Citation 21-403(b) Transportation Article FALSE BLACK F UPPER MARLBORO MD MD A - Marked Patrol 2014 12 2014-12-01 1
2017-08-29 9H 22M 0S MCP 3rd district, Silver Spring FAILURE YIELD RIGHT OF WAY ON U TURN CHERRY HILL RD./CALVERTON BLVD. 39.05698 -76.95463 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE MD 02 - Automobile 1998 DODG DAKOTA WHITE Citation 21-402(b) Transportation Article FALSE BLACK M FORT WASHINGTON MD MD A - Marked Patrol 2017 8 2017-08-01 29
2017-08-28 23H 41M 0S MCP 6th district, Gaithersburg / Montgomery Village FAILURE OF DR. TO MAKE LANE CHANGE TO AVAIL. LANE NOT IMMED. ADJ. TO STOPPED EMERG. VEH, 355 @ SOUTH WESTLAND DRIVE NA NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE MD 02 - Automobile 2015 MINI COOPER 2S WHITE Citation 21-405(e1) Transportation Article FALSE WHITE M GAITHERSBURG MD MD A - Marked Patrol 2017 8 2017-08-01 28
2013-08-27 55M 0S MCP 2nd district, Bethesda NEGLIGENT DRIVING VEHICLE IN CARELESS AND IMPRUDENT MANNER ENDANGERING PROPERTY, LIFE AND PERSON CONNECTICUT/CHEVY CHASE LAKE NA NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE MD 02 - Automobile 2013 HYUNDAI ELANTRA GRAY Citation 21-901.1(b) Transportation Article FALSE WHITE F SILVER SPRING MD MD A - Marked Patrol 2013 8 2013-08-01 27
# Print the type and first few values of every column.
traffic %>% str()
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1437177 obs. of  38 variables:
##  $ date_of_stop           : Date, format: "2013-09-24" "2017-08-29" ...
##  $ time_of_stop           :Formal class 'Period' [package "lubridate"] with 6 slots
##   .. ..@ .Data : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ year  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ month : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ day   : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ hour  : num  17 10 12 9 23 0 13 0 23 23 ...
##   .. ..@ minute: num  11 19 52 22 41 55 23 38 41 41 ...
##  $ agency                 : chr  "MCP" "MCP" "MCP" "MCP" ...
##  $ subagency              : chr  "3rd district, Silver Spring" "2nd district, Bethesda" "6th district, Gaithersburg / Montgomery Village" "3rd district, Silver Spring" ...
##  $ description            : chr  "DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGISTRATION" "DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC CONTROL DEVICE INSTRUCTIONS" "FAILURE STOP AND YIELD AT THRU HWY" "FAILURE YIELD RIGHT OF WAY ON U TURN" ...
##  $ location               : chr  "8804 FLOWER AVE" "WISCONSIN AVE@ ELM ST" "CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE" "CHERRY HILL RD./CALVERTON BLVD." ...
##  $ latitude               : num  NA 39 39.2 39.1 NA ...
##  $ longitude              : num  NA -77.1 -77.2 -77 NA ...
##  $ accident               : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ belts                  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ personal_injury        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ property_damage        : logi  FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ fatal                  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ commercial_license     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ hazmat                 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ commercial_vehicle     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ alcohol                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ work_zone              : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ state                  : chr  "MD" "VA" "MD" "MD" ...
##  $ vehicletype            : chr  "02 - Automobile" "02 - Automobile" "02 - Automobile" "02 - Automobile" ...
##  $ year                   : num  2008 2001 2001 1998 2015 ...
##  $ make                   : chr  "FORD" "TOYOTA" "HONDA" "DODG" ...
##  $ model                  : chr  "4S" "COROLLA" "ACCORD" "DAKOTA" ...
##  $ color                  : chr  "BLACK" "GREEN" "SILVER" "WHITE" ...
##  $ violation_type         : chr  "Citation" "Citation" "Citation" "Citation" ...
##  $ charge                 : chr  "13-401(h)" "21-201(a1)" "21-403(b)" "21-402(b)" ...
##  $ article                : chr  "Transportation Article" "Transportation Article" "Transportation Article" "Transportation Article" ...
##  $ contributed_to_accident: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ race                   : chr  "BLACK" "WHITE" "BLACK" "BLACK" ...
##  $ gender                 : chr  "M" "F" "F" "M" ...
##  $ driver_city            : chr  "TAKOMA PARK" "FAIRFAX STATION" "UPPER MARLBORO" "FORT WASHINGTON" ...
##  $ driver_state           : chr  "MD" "VA" "MD" "MD" ...
##  $ dl_state               : chr  "MD" "VA" "MD" "MD" ...
##  $ arrest_type            : chr  "A - Marked Patrol" "A - Marked Patrol" "A - Marked Patrol" "A - Marked Patrol" ...
##  $ year_of_stop           : num  2013 2017 2014 2017 2017 ...
##  $ month_of_stop          : num  9 8 12 8 8 8 10 4 8 8 ...
##  $ month_year             : Date, format: "2013-09-01" "2017-08-01" ...
##  $ day_of_stop            : int  24 29 1 29 28 27 8 24 28 28 ...
# Bar chart of the number of recorded violations in each year.
traffic %>%
  ggplot(aes(x = year_of_stop)) +
  geom_bar() +
  labs(
    title = "Traffic Violations by Year",
    x = "Year",
    y = "Violations"
  )

# Count the stops in each calendar month. The month number is kept as a
# separate column so that it can drive the fill colour of the bars.
plot_data <- traffic %>%
  group_by(month_year) %>%
  summarise(number_of_stops = n()) %>%
  ungroup() %>%
  mutate(month_of_stop = lubridate::month(month_year))

# Interactive column chart of stops per month; the legend is hidden.
plotly::ggplotly(
  ggplot(
    plot_data,
    aes(x = month_year, y = number_of_stops, fill = month_of_stop)
  ) +
    geom_col() +
    theme(legend.position = "none") +
    labs(
      title = "Traffic Violations over Time",
      x = "Time",
      y = "Number of Stops"
    )
)

Are stops more likely at certain points of the month?

# Step 1: express each stop as the fraction of its month that had elapsed,
# i.e. seconds since the start of the month divided by the number of seconds
# in that month.
# Step 2: snap that fraction to the lower edge of one of 30 equal-width bins
# (0, 1/30, ..., 29/30) so that months of different lengths are comparable.
plot_data <- traffic %>%
  mutate(
    point_in_month =
      (lubridate::period_to_seconds(lubridate::days(day_of_stop - 1)) +
        lubridate::period_to_seconds(time_of_stop)) /
        lubridate::period_to_seconds(
          lubridate::days(lubridate::days_in_month(date_of_stop))
        )
  ) %>%
  mutate(
    point_in_month =
      (findInterval(point_in_month, (0:30) / 30, all.inside = TRUE) - 1) / 30
  )

# Bar chart of the number of violations falling in each bin of the month.
plot_data %>%
  ggplot(aes(x = point_in_month)) +
  geom_bar() +
  labs(
    title = "Are Violations More Likely at Certain Times of a Month",
    x = "Time of Month",
    y = "Violations"
  )

# Build the 12-hour clock label for every minute of the day, in chronological
# order: "12:00AM", "12:01AM", ..., "11:59AM", "12:00PM", ..., "11:59PM".
# Element i + 1 is the label for minute i after midnight.
clock_hours <- c(12, 1:11)
clock_minutes <- sprintf("%02d", 0:59)
clock_am_pm <- c("AM", "PM")

minutes_per_half_day <- length(clock_hours) * length(clock_minutes)

# The minute varies fastest, then the hour, then the AM/PM suffix.
clock_order <- paste0(
  rep(
    rep(clock_hours, each = length(clock_minutes)),
    times = length(clock_am_pm)
  ),
  ":",
  rep(clock_minutes, times = length(clock_hours) * length(clock_am_pm)),
  rep(clock_am_pm, each = minutes_per_half_day)
)
# Convert the time of each stop into the number of minutes since midnight.
plot_data <- traffic %>%
  mutate(minute = lubridate::period_to_seconds(time_of_stop) / 60)

# Interactive bar chart of stops per minute of the day. The text aesthetic
# looks up the clock label for each minute in clock_order (minute 0 is
# element 1), and the x axis is labelled once per hour.
plotly::ggplotly(
  ggplot(data = plot_data) +
    geom_bar(aes(x = minute, text = clock_order[minute + 1])) +
    labs(
      title = "Number of Stops by Time of Day",
      x = "Time",
      y = "Number of Stops"
    ) +
    scale_x_continuous(
      breaks = (0:23) * 60,
      labels = c(
        "12:00AM", paste0(1:11, ":00AM"),
        "12:00PM", paste0(1:11, ":00PM")
      )
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
)
## Warning: Ignoring unknown aesthetics: text
# These are the farthest latitudes/longitudes of Montgomery County in each
# direction, based on Google Maps.
north <- 39.36
south <- 38.92
west <- -77.55
east <- -76.88

# Scatter plot of every stop whose coordinates fall strictly inside that
# bounding box; small, mostly transparent points keep dense areas readable.
traffic %>%
  filter(
    longitude > west,
    longitude < east,
    latitude < north,
    latitude > south
  ) %>%
  ggplot(aes(x = longitude, y = latitude)) +
  geom_point(alpha = .2, size = .1) +
  ggtitle("Where do Violations Occur?")

Write up

For this exam, I worked with the data set of traffic violations in Montgomery County, Maryland. The dataset was found at https://catalog.data.gov/dataset/traffic-violations-56dda. I dropped the data from 2019, which is only partially complete. The data was mostly clean from the site, but I converted the dates and times using the lubridate package and converted the character Yes/No values into boolean TRUE/FALSE values. I chose this data set because I thought that it would be interesting to see if there are any trends in traffic enforcement/violations in the county. Overall, I didn’t find too much that I would consider to be surprising. The peak in violations around midnight isn’t unexpected. It is likely explained by an increase in drunk drivers in addition to the violations that would be regularly committed at any given time.

I tried a number of visualizations. First I checked the number of violations recorded each year. The number of violations increased each year between 2012 and 2015, but has since leveled out and is now slightly down. Next I made the same graph, but upped the resolution to monthly. It showed mostly the same thing, but also revealed that there were a few months with random spikes in violations. I couldn’t see any pattern in these months, however. The next graph breaks down the violations by the point in the month at which they occurred. The goal of this graph was to see if monthly quotas were used that may cause an increase in violations at certain points in the month as officers try to meet the quota. The graph showed that violations were consistent at every point of the month, with no significant spikes or dips. Next, I looked at the time of day at which violations were issued. This graph showed a significant variation in the rate at which violations were issued depending on the time of day. Significantly more violations are issued at night, peaking around midnight. The fewest violations are issued around 5am, likely because far fewer drivers are on the road. Finally, I created a scatter plot based on the longitude and latitude of the violations. This more or less created a map of the county’s most traveled roads. I had to filter out some of the longitude and latitude data which was well outside of the county’s boundaries. There were a handful of data points that had longitudes and latitudes which indicated that the violation occurred hundreds of miles away from Montgomery County, including a few in the middle of the Atlantic Ocean. I found this graph to be pretty interesting: without using any special geospatial graphing, the county’s boundaries are pretty clear, and the locations of the bigger cities and towns are quite visible.